Título: Identificación de biomarcadores de expresión génica asociadas al adenocarcinoma de pulmón mediante análisis RNA-seq y validación funcional in silico
Datos:
Pipeline:
Autor: José Eduardo Hidalgo Suero
Máster: Bioestadística y Bioinformática
Institución: Centro Europeo de Másteres y Postgrado (CEMP)
Repositorio: GitHub - pipeline-rnaseq-luad-tfm
Resultados principales:
Software utilizado:
Dashboard generado con flexdashboard
---
title: "Pipeline RNA-seq: Biomarcadores en LUAD"
output:
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    theme: cosmo
    source_code: embed
---
```{r setup, include=FALSE}
library(flexdashboard)
library(plotly)
library(DT)
library(dplyr)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(heatmaply)
library(RColorBrewer)
# Load the result tables used by the dashboard (DEGs, GO, KEGG, GSEA).
# All of them are read the same way: strings stay as character vectors.
read_results <- function(path) {
  read.csv(path, stringsAsFactors = FALSE)
}
degs <- read_results("DEGs_con_simbolos_completo.csv")
go_bp <- read_results("GO_BP_results.csv")
go_mf <- read_results("GO_MF_results.csv")
go_cc <- read_results("GO_CC_results.csv")
kegg <- read_results("KEGG_results.csv")
gsea_go <- read_results("GSEA_GO_results.csv")
gsea_kegg <- read_results("GSEA_KEGG_results.csv")

# Raw count matrix: the first CSV column supplies the row names.
counts_raw <- read.csv(
  "raw_counts_matrix.csv",
  row.names = 1,
  stringsAsFactors = FALSE
)

# Build the sample metadata from the column names of the count matrix.
# Names matching ^SRR3475[3-9] are labelled "Normal", all others "Tumor"
# (author's note: Tumor = SRR3474xxx and SRR34750xx-SRR3475138,
# Normal = SRR3475323 onwards).
sample_names <- colnames(counts_raw)
is_normal <- grepl("^SRR3475[3-9]", sample_names)
condition <- ifelse(is_normal, "Normal", "Tumor")
metadata <- data.frame(
  sample = sample_names,
  condition = factor(condition, levels = c("Normal", "Tumor")),
  row.names = sample_names
)

# DESeq2 object; genes with fewer than 10 reads summed over all samples
# are dropped before the variance-stabilising transformation.
dds <- DESeqDataSetFromMatrix(
  countData = as.matrix(counts_raw),
  colData = metadata,
  design = ~ condition
)
enough_reads <- rowSums(counts(dds)) >= 10
dds <- dds[enough_reads, ]
vst_data <- vst(dds, blind = TRUE)

# PCA on samples (rows after transposing) and the first two components.
pca_result <- prcomp(t(assay(vst_data)))
pc_scores <- pca_result$x
pca_df <- data.frame(
  PC1 = pc_scores[, 1],
  PC2 = pc_scores[, 2],
  sample = rownames(pc_scores),
  condition = metadata[rownames(pc_scores), "condition"]
)
# Percentage of variance explained by PC1 and PC2, one decimal place.
var_explained <- round(100 * summary(pca_result)$importance[2, 1:2], 1)
```
# Resumen
## Column {data-width=350}
### Información del Estudio
**Título:** Identificación de biomarcadores de expresión génica asociadas al adenocarcinoma de pulmón mediante análisis RNA-seq y validación funcional in silico

**Datos:**

- Fuente: GEO (PRJNA320473)
- Muestras: 20 (10 normal + 10 tumor)
- Plataforma: Illumina HiSeq 2500
- Genoma: GRCh38.p14

**Pipeline:**

1. Control de calidad (FastQC/MultiQC)
2. Alineamiento (Rsubread)
3. Cuantificación (featureCounts)
4. Expresión diferencial (DESeq2)
5. Enriquecimiento funcional (clusterProfiler)
### DEGs Identificados
```{r}
# Count DEGs per direction; na.rm = TRUE ignores rows with a missing label.
n_up <- sum(degs$regulacion == "Up", na.rm = TRUE)
n_down <- sum(degs$regulacion == "Down", na.rm = TRUE)
# The total is the sum of both directions rather than nrow(degs), so rows
# that are neither "Up" nor "Down" (e.g. "NS" or NA) are not reported as
# DEGs and the three value boxes always agree with each other.
n_total <- n_up + n_down
valueBox(n_total, caption = "Total DEGs", icon = "fa-dna", color = "primary")
```
### Sobreexpresados
```{r}
# Value box showing the number of up-regulated genes.
valueBox(
  value = n_up,
  caption = "Genes Up",
  icon = "fa-arrow-up",
  color = "danger"
)
```
### Subexpresados
```{r}
# Value box showing the number of down-regulated genes.
valueBox(
  value = n_down,
  caption = "Genes Down",
  icon = "fa-arrow-down",
  color = "info"
)
```
## Column {data-width=650}
### Distribución de Fold Change
```{r}
# Histogram of log2 fold changes, filled by regulation direction.
fc_palette <- c("Up" = "#E74C3C", "Down" = "#3498DB")
p <- ggplot(data = degs, mapping = aes(x = log2FoldChange, fill = regulacion))
p <- p + geom_histogram(bins = 50, alpha = 0.7)
p <- p + scale_fill_manual(values = fc_palette)
p <- p + labs(x = "log2 Fold Change", y = "Frecuencia", fill = "Regulación")
# The complete theme goes first so the legend tweak is not overwritten.
p <- p + theme_minimal() + theme(legend.position = "bottom")
# Convert to an interactive plotly widget.
ggplotly(p)
```
### Top 10 Genes por Fold Change
```{r}
# Ten genes with the largest absolute log2 fold change.
# desc() is namespace-qualified on purpose: packages attached after dplyr
# (IRanges, pulled in by DESeq2) export their own desc(), which can mask
# dplyr's version.
top_genes <- degs %>%
  arrange(dplyr::desc(abs(log2FoldChange))) %>%
  head(10) %>%
  select(symbol, log2FoldChange, padj, regulacion) %>%
  # Round for display, matching the other DEG tables of the dashboard.
  mutate(log2FoldChange = round(log2FoldChange, 3))

# Static table: dom = "t" shows only the table body (no search or paging).
datatable(
  top_genes,
  options = list(pageLength = 10, dom = "t"),
  rownames = FALSE,
  colnames = c("Gen", "log2FC", "p-adj", "Regulación")
)
```
# Expresión Diferencial
## Column {data-width=500}
### PCA: Tumor vs Normal
```{r}
# PCA scatter plot of the samples, coloured by condition.
pca_colors <- c("Normal" = "#00BFC4", "Tumor" = "#F8766D")
pc1_label <- paste0("PC1: ", var_explained[1], "% varianza")
pc2_label <- paste0("PC2: ", var_explained[2], "% varianza")

# The `text` aesthetic carries the sample name into the plotly tooltip.
p_pca <- ggplot(
  pca_df,
  aes(x = PC1, y = PC2, color = condition, text = sample)
) +
  geom_point(size = 4, alpha = 0.8) +
  scale_color_manual(values = pca_colors) +
  labs(x = pc1_label, y = pc2_label, color = "Condición") +
  theme_minimal() +
  theme(legend.position = "bottom")

ggplotly(p_pca, tooltip = c("text", "x", "y"))
```
## Column {data-width=500}
### Volcano Plot Interactivo
```{r}
# -log10 of the adjusted p-value; padj == 0 yields Inf, which is replaced by
# the largest finite value plus 10 so those genes stay visible on the plot.
neg_log10 <- -log10(as.numeric(degs$padj))
finite_ceiling <- max(neg_log10[is.finite(neg_log10)])
neg_log10[is.infinite(neg_log10)] <- finite_ceiling + 10
degs$neg_log10_padj <- neg_log10

volcano_colors <- c("Up" = "#E74C3C", "Down" = "#3498DB", "NS" = "gray70")

# Dashed guides mark |log2FC| = 1 and p-adj = 0.05.
p_volcano <- ggplot(
  degs,
  aes(
    x = log2FoldChange,
    y = neg_log10_padj,
    color = regulacion,
    text = symbol
  )
) +
  geom_point(alpha = 0.6, size = 1) +
  geom_vline(xintercept = c(-1, 1), linetype = "dashed", color = "gray40") +
  geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray40") +
  scale_color_manual(values = volcano_colors) +
  labs(x = "log2 Fold Change", y = "-log10(p-adj)", color = "Regulación") +
  theme_minimal() +
  theme(legend.position = "bottom")

ggplotly(p_volcano, tooltip = c("text", "x", "y"))
```
# Heatmap
## Column {data-width=700}
### Heatmap Interactivo - Top 50 DEGs
```{r, fig.height=8}
library(heatmaply)

# Top 50 DEGs ranked by adjusted p-value (smallest first).
top50_degs <- degs %>%
  arrange(padj) %>%
  head(50)
top50_genes <- top50_degs$gene_id
top50_symbols <- top50_degs$symbol

# Restrict the VST matrix to those genes. drop = FALSE keeps a matrix even
# when only one gene matches, so the scaling below still works.
vst_matrix <- assay(vst_data)
top50_matrix <- vst_matrix[rownames(vst_matrix) %in% top50_genes, , drop = FALSE]

# Label rows with gene symbols. Genes without a symbol (NA or empty string)
# keep their gene ID, and make.unique() disambiguates repeated symbols, so
# every heatmap row has a usable, distinct label.
gene_id_to_symbol <- setNames(as.character(top50_degs$symbol), top50_degs$gene_id)
row_labels <- unname(gene_id_to_symbol[rownames(top50_matrix)])
missing_symbol <- is.na(row_labels) | !nzchar(row_labels)
row_labels[missing_symbol] <- rownames(top50_matrix)[missing_symbol]
rownames(top50_matrix) <- make.unique(row_labels)

# Row-wise z-score: scale() works on columns, hence the double transpose.
top50_scaled <- t(scale(t(top50_matrix)))

# Column annotation: one row per sample, in the same order as `metadata`.
col_annotation <- data.frame(
  Condición = metadata$condition,
  row.names = rownames(metadata)
)

# Interactive heatmap with row and column dendrograms.
heatmaply(
  top50_scaled,
  row_side_colors = NULL,
  col_side_colors = col_annotation,
  colors = colorRampPalette(c("#3498DB", "white", "#E74C3C"))(100),
  dendrogram = "both",
  showticklabels = c(TRUE, TRUE),
  fontsize_row = 7,
  fontsize_col = 8,
  margins = c(100, 150, 50, 50),
  main = "Top 50 DEGs (z-score)"
)
```
## Column {data-width=300}
### Tabla Top 50 DEGs
```{r}
# Table companion of the heatmap: the same top-50 genes, with log2FC rounded.
top50_tabla <- select(top50_degs, symbol, log2FoldChange, padj, regulacion)
top50_tabla$log2FoldChange <- round(top50_tabla$log2FoldChange, 3)

# dom = 'tip': table body, info line and pagination only.
datatable(
  data = top50_tabla,
  rownames = FALSE,
  colnames = c("Gen", "log2FC", "p-adj", "Regulación"),
  options = list(pageLength = 15, scrollX = TRUE, dom = 'tip')
)
```
# Tabla DEGs
## Column
### Tabla Completa de DEGs (filtrable)
```{r}
# Full DEG table with per-column filters; numeric columns rounded for display.
degs_tabla <- select(
  degs,
  symbol, gene_id, log2FoldChange, padj, baseMean, regulacion
)
degs_tabla$log2FoldChange <- round(degs_tabla$log2FoldChange, 3)
degs_tabla$baseMean <- round(degs_tabla$baseMean, 2)

datatable(
  data = degs_tabla,
  rownames = FALSE,
  filter = 'top',
  colnames = c("Símbolo", "Gene ID", "log2FC", "p-adj", "baseMean", "Regulación"),
  options = list(pageLength = 20, scrollX = TRUE)
)
```
# Gene Ontology
## Column {.tabset}
### Procesos Biológicos (BP)
```{r}
# Top 20 GO Biological Process terms by adjusted p-value.
go_bp_plot <- head(arrange(go_bp, p.adjust), 20)
# Reversed factor levels put the most significant term at the top of the axis.
go_bp_plot$Description <- factor(
  go_bp_plot$Description,
  levels = rev(go_bp_plot$Description)
)

# Horizontal bars: gene count per term, coloured by adjusted p-value.
p_bp <- ggplot(go_bp_plot, aes(x = Count, y = Description, fill = p.adjust))
p_bp <- p_bp + geom_bar(stat = "identity")
p_bp <- p_bp + scale_fill_gradient(low = "#E74C3C", high = "#3498DB")
p_bp <- p_bp + labs(x = "Número de genes", y = "", fill = "p-adj")
p_bp <- p_bp + theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggplotly(p_bp)
```
### Funciones Moleculares (MF)
```{r}
# Top 20 GO Molecular Function terms by adjusted p-value.
go_mf_plot <- head(arrange(go_mf, p.adjust), 20)
# Reversed factor levels put the most significant term at the top of the axis.
go_mf_plot$Description <- factor(
  go_mf_plot$Description,
  levels = rev(go_mf_plot$Description)
)

# Horizontal bars: gene count per term, coloured by adjusted p-value.
p_mf <- ggplot(go_mf_plot, aes(x = Count, y = Description, fill = p.adjust))
p_mf <- p_mf + geom_bar(stat = "identity")
p_mf <- p_mf + scale_fill_gradient(low = "#E74C3C", high = "#3498DB")
p_mf <- p_mf + labs(x = "Número de genes", y = "", fill = "p-adj")
p_mf <- p_mf + theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggplotly(p_mf)
```
### Componentes Celulares (CC)
```{r}
# Top 20 GO Cellular Component terms by adjusted p-value.
go_cc_plot <- head(arrange(go_cc, p.adjust), 20)
# Reversed factor levels put the most significant term at the top of the axis.
go_cc_plot$Description <- factor(
  go_cc_plot$Description,
  levels = rev(go_cc_plot$Description)
)

# Horizontal bars: gene count per term, coloured by adjusted p-value.
p_cc <- ggplot(go_cc_plot, aes(x = Count, y = Description, fill = p.adjust))
p_cc <- p_cc + geom_bar(stat = "identity")
p_cc <- p_cc + scale_fill_gradient(low = "#E74C3C", high = "#3498DB")
p_cc <- p_cc + labs(x = "Número de genes", y = "", fill = "p-adj")
p_cc <- p_cc + theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggplotly(p_cc)
```
### Tabla GO Completa
```{r}
# Tag each GO result table with its ontology, then stack them.
go_bp_tagged <- mutate(go_bp, Ontology = "BP")
go_mf_tagged <- mutate(go_mf, Ontology = "MF")
go_cc_tagged <- mutate(go_cc, Ontology = "CC")
go_all <- bind_rows(go_bp_tagged, go_mf_tagged, go_cc_tagged)

# Keep the display columns and show p-values with 3 significant digits.
go_all <- select(go_all, Ontology, ID, Description, Count, p.adjust)
go_all$p.adjust <- signif(go_all$p.adjust, 3)

datatable(
  data = go_all,
  rownames = FALSE,
  filter = 'top',
  colnames = c("Ontología", "ID", "Descripción", "Genes", "p-adj"),
  options = list(pageLength = 15, scrollX = TRUE)
)
```
# Vías KEGG
## Column {data-width=600}
### Top 20 Vías KEGG
```{r}
# Top 20 KEGG pathways by adjusted p-value.
kegg_plot <- head(arrange(kegg, p.adjust), 20)
# Reversed factor levels put the most significant pathway at the top.
kegg_plot$Description <- factor(
  kegg_plot$Description,
  levels = rev(kegg_plot$Description)
)

# Horizontal bars: gene count per pathway, coloured by adjusted p-value.
p_kegg <- ggplot(kegg_plot, aes(x = Count, y = Description, fill = p.adjust))
p_kegg <- p_kegg + geom_bar(stat = "identity")
p_kegg <- p_kegg + scale_fill_gradient(low = "#E74C3C", high = "#3498DB")
p_kegg <- p_kegg + labs(x = "Número de genes", y = "", fill = "p-adj")
p_kegg <- p_kegg + theme_minimal() + theme(axis.text.y = element_text(size = 9))
ggplotly(p_kegg)
```
## Column {data-width=400}
### Tabla de Vías KEGG
```{r}
# Filterable KEGG table; p-values shown with 3 significant digits.
kegg_tabla <- select(kegg, ID, Description, Count, p.adjust, category)
kegg_tabla$p.adjust <- signif(kegg_tabla$p.adjust, 3)

datatable(
  data = kegg_tabla,
  rownames = FALSE,
  filter = 'top',
  colnames = c("ID", "Vía", "Genes", "p-adj", "Categoría"),
  options = list(pageLength = 15, scrollX = TRUE)
)
```
# GSEA
## Column {.tabset}
### GSEA - GO
```{r}
# Top 30 GSEA GO gene sets by adjusted p-value.
gsea_go_plot <- head(arrange(gsea_go, p.adjust), 30)
# Positive NES is labelled "Activado", everything else "Suprimido".
gsea_go_plot$Direction <- ifelse(gsea_go_plot$NES > 0, "Activado", "Suprimido")
# Reversed factor levels put the most significant set at the top of the axis.
gsea_go_plot$Description <- factor(
  gsea_go_plot$Description,
  levels = rev(gsea_go_plot$Description)
)

gsea_go_colors <- c("Activado" = "#E74C3C", "Suprimido" = "#3498DB")
p_gsea_go <- ggplot(gsea_go_plot, aes(x = NES, y = Description, fill = Direction))
p_gsea_go <- p_gsea_go + geom_bar(stat = "identity")
p_gsea_go <- p_gsea_go + scale_fill_manual(values = gsea_go_colors)
p_gsea_go <- p_gsea_go +
  labs(x = "Normalized Enrichment Score (NES)", y = "", fill = "Dirección")
p_gsea_go <- p_gsea_go +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 7))
ggplotly(p_gsea_go)
```
### GSEA - KEGG
```{r}
# Top 30 GSEA KEGG gene sets by adjusted p-value.
gsea_kegg_plot <- head(arrange(gsea_kegg, p.adjust), 30)
# Positive NES is labelled "Activado", everything else "Suprimido".
gsea_kegg_plot$Direction <- ifelse(gsea_kegg_plot$NES > 0, "Activado", "Suprimido")
# Reversed factor levels put the most significant set at the top of the axis.
gsea_kegg_plot$Description <- factor(
  gsea_kegg_plot$Description,
  levels = rev(gsea_kegg_plot$Description)
)

gsea_kegg_colors <- c("Activado" = "#E74C3C", "Suprimido" = "#3498DB")
p_gsea_kegg <- ggplot(gsea_kegg_plot, aes(x = NES, y = Description, fill = Direction))
p_gsea_kegg <- p_gsea_kegg + geom_bar(stat = "identity")
p_gsea_kegg <- p_gsea_kegg + scale_fill_manual(values = gsea_kegg_colors)
p_gsea_kegg <- p_gsea_kegg +
  labs(x = "Normalized Enrichment Score (NES)", y = "", fill = "Dirección")
p_gsea_kegg <- p_gsea_kegg +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))
ggplotly(p_gsea_kegg)
```
### Tabla GSEA Completa
```{r}
# Stack GO and KEGG GSEA results, tagging each row with its source.
gsea_go_tagged <- mutate(gsea_go, Source = "GO")
gsea_kegg_tagged <- mutate(gsea_kegg, Source = "KEGG")
gsea_all <- bind_rows(gsea_go_tagged, gsea_kegg_tagged)

# Direction from the sign of NES; then keep and round the display columns.
gsea_all$Direction <- ifelse(gsea_all$NES > 0, "Activado", "Suprimido")
gsea_all <- select(gsea_all, Source, ID, Description, NES, p.adjust, Direction)
gsea_all$NES <- round(gsea_all$NES, 3)
gsea_all$p.adjust <- signif(gsea_all$p.adjust, 3)

datatable(
  data = gsea_all,
  rownames = FALSE,
  filter = 'top',
  colnames = c("Fuente", "ID", "Descripción", "NES", "p-adj", "Dirección"),
  options = list(pageLength = 15, scrollX = TRUE)
)
```
# Acerca de
## Column
### Información del Proyecto
**Autor:** José Eduardo Hidalgo Suero

**Máster:** Bioestadística y Bioinformática

**Institución:** Centro Europeo de Másteres y Postgrado (CEMP)

---

**Repositorio:** [GitHub - pipeline-rnaseq-luad-tfm](https://github.com/Licen03/pipeline-rnaseq-luad-tfm)

---
**Resultados principales:**

- **6,989 DEGs** identificados (4,328 sobreexpresados, 2,661 subexpresados)
- **Genes destacados:** MMP13, COL11A1, COL1A1 (asociados a remodelación de matriz extracelular)
- **Vía PI3K-Akt** significativamente enriquecida
- **Supresión de vías inmunológicas** (presentación de antígeno, MHC)

---

**Software utilizado:**

- R 4.4.2 / RStudio
- DESeq2 1.46.0
- clusterProfiler 4.14.6
- flexdashboard
- plotly / DT

---

*Dashboard generado con flexdashboard*